#This script tests for an association between base-paired sites within HCSS and complementarily coevolving sites

#==========================================Importing modules====================================================
import sys
import time
import csv
import os
import time
import shutil
from Bio import SeqIO
import subprocess
from subprocess import *


#========================================== Listing the paths ==================================================
#Current working directory. This directory contains all the scripts necessary for this analysis.
PATH_OUT = os.getcwd() + "/"  # Change this path to where you want to place the results

# Root directories shared by the paths below; edit these two values to relocate the inputs.
_COEV_ROOT = "C:/eclipse/workspace/Covariation_Analysis/"
_STRUCT_ROOT = "C:/eclipse/workspace/Syn_Sub_Analysis/"

# Paths to the complementary coevolution results. The results contain files that indicate
# at which sites significant (p-value < 0.05) complementary coevolution was detected.
PATH_IN0 = _COEV_ROOT + "GCV_Sep/win100GU/"
PATH_IN1 = _COEV_ROOT + "PCV_Sep/win100GU/"
PATH_IN2 = _COEV_ROOT + "BFDV_Sep/win100GU/"
PATH_IN3 = _COEV_ROOT + "DCV_Sep/win100GU/"
PATH_IN4 = _COEV_ROOT + "DG_CV_Sep/win100GU/"
PATH_IN5 = _COEV_ROOT + "Parvo_a/output/"
PATH_IN6 = _COEV_ROOT + "Parvo_b/output/"
PATH_IN7 = _COEV_ROOT + "Parvo_c/output/"
PATH_IN8 = _COEV_ROOT + "Parvo_e/output/"
PATH_IN9 = _COEV_ROOT + "Anello_a/output/"
PATH_IN10 = _COEV_ROOT + "Anello_b/output/"
PATH_IN11 = _COEV_ROOT + "Anello_c/output/"
PATH_IN12 = _COEV_ROOT + "Anello_e/output/"
PATH_IN13 = _COEV_ROOT + "BFDV_Full_Sep/output/"
PATH_IN14 = _COEV_ROOT + "Dicot1_Sep/output/"
PATH_IN15 = _COEV_ROOT + "BBTV_R_Sep/output/"
PATH_IN16 = _COEV_ROOT + "BBTV_N_Sep/output/"
PATH_IN17 = _COEV_ROOT + "BBTV_S_Sep/output/"
PATH_IN18 = _COEV_ROOT + "BBTV_C_Sep/output/"
PATH_IN19 = _COEV_ROOT + "MSV_Sep/output/"
PATH_IN20 = _COEV_ROOT + "Dicot2_Sep/output/"
PATH_IN21 = _COEV_ROOT + "PanSV_Sep/output/"
PATH_IN22 = _COEV_ROOT + "WDV_Sep/output/"
PATH_IN23 = _COEV_ROOT + "Begomo5_Sep/output/"
PATH_IN24 = _COEV_ROOT + "Begomo6_Sep/output/"
PATH_IN25 = _COEV_ROOT + "Begomo9_Sep/output/"
PATH_IN26 = _COEV_ROOT + "BBTV_M_Sep/output/"
PATH_IN27 = _COEV_ROOT + "PiCV_Sep/output/"
PATH_IN28 = _COEV_ROOT + "rubella/output/"

# Paths to the secondary structure prediction results. The files contain information about
# which sites are within the HCSS and which are not within the HCSS.
PATH_INs_STRUCT0 = _STRUCT_ROOT + "Infiles_Circo/"
PATH_INs_STRUCT1 = _STRUCT_ROOT + "Infiles_Circo/"
PATH_INs_STRUCT2 = _STRUCT_ROOT + "Infiles_Circo/"
PATH_INs_STRUCT3 = _STRUCT_ROOT + "Infiles_Circo/"
PATH_INs_STRUCT4 = _STRUCT_ROOT + "Infiles_Circo/"
PATH_INs_STRUCT5 = _STRUCT_ROOT + "Infiles_Parvo/"
PATH_INs_STRUCT6 = _STRUCT_ROOT + "Infiles_Parvo/"
PATH_INs_STRUCT7 = _STRUCT_ROOT + "Infiles_Parvo/"
PATH_INs_STRUCT8 = _STRUCT_ROOT + "Infiles_Parvo/"
PATH_INs_STRUCT9 = _STRUCT_ROOT + "Infiles_Anello/"
PATH_INs_STRUCT10 = _STRUCT_ROOT + "Infiles_Anello/"
PATH_INs_STRUCT11 = _STRUCT_ROOT + "Infiles_Anello/"
PATH_INs_STRUCT12 = _STRUCT_ROOT + "Infiles_Anello/"
PATH_INs_STRUCT13 = _STRUCT_ROOT + "Infiles_BFDV_184/"
PATH_INs_STRUCT14 = _STRUCT_ROOT + "Infiles_Geminivirus/"
PATH_INs_STRUCT15 = _STRUCT_ROOT + "Infiles_Nanovirus/"
PATH_INs_STRUCT16 = _STRUCT_ROOT + "Infiles_Nanovirus/"
PATH_INs_STRUCT17 = _STRUCT_ROOT + "Infiles_Nanovirus/"
PATH_INs_STRUCT18 = _STRUCT_ROOT + "Infiles_Nanovirus/"
PATH_INs_STRUCT19 = _STRUCT_ROOT + "Infiles_Geminivirus/"
PATH_INs_STRUCT20 = _STRUCT_ROOT + "Infiles_Geminivirus/"
PATH_INs_STRUCT21 = _STRUCT_ROOT + "Infiles_Geminivirus/"
PATH_INs_STRUCT22 = _STRUCT_ROOT + "Infiles_Geminivirus/"
PATH_INs_STRUCT23 = _STRUCT_ROOT + "Infiles_Geminivirus/"
PATH_INs_STRUCT24 = _STRUCT_ROOT + "Infiles_Geminivirus/"
PATH_INs_STRUCT25 = _STRUCT_ROOT + "Infiles_Geminivirus/"
# Datasets 26 and 28 keep their structure files next to the coevolution output.
PATH_INs_STRUCT26 = _COEV_ROOT + "BBTV_M_Sep/output/"
PATH_INs_STRUCT27 = _STRUCT_ROOT + "Infiles_Circo/"
PATH_INs_STRUCT28 = _COEV_ROOT + "rubella/output/"

# Putting all paths into one list for easy access (list index == dataset number).
PATH_LIST = [
    PATH_IN0, PATH_IN1, PATH_IN2, PATH_IN3, PATH_IN4, PATH_IN5, PATH_IN6,
    PATH_IN7, PATH_IN8, PATH_IN9, PATH_IN10, PATH_IN11, PATH_IN12, PATH_IN13,
    PATH_IN14, PATH_IN15, PATH_IN16, PATH_IN17, PATH_IN18, PATH_IN19, PATH_IN20,
    PATH_IN21, PATH_IN22, PATH_IN23, PATH_IN24, PATH_IN25, PATH_IN26, PATH_IN27,
    PATH_IN28,
]
PATH_LIST_INs_STRUCT = [
    PATH_INs_STRUCT0, PATH_INs_STRUCT1, PATH_INs_STRUCT2, PATH_INs_STRUCT3,
    PATH_INs_STRUCT4, PATH_INs_STRUCT5, PATH_INs_STRUCT6, PATH_INs_STRUCT7,
    PATH_INs_STRUCT8, PATH_INs_STRUCT9, PATH_INs_STRUCT10, PATH_INs_STRUCT11,
    PATH_INs_STRUCT12, PATH_INs_STRUCT13, PATH_INs_STRUCT14, PATH_INs_STRUCT15,
    PATH_INs_STRUCT16, PATH_INs_STRUCT17, PATH_INs_STRUCT18, PATH_INs_STRUCT19,
    PATH_INs_STRUCT20, PATH_INs_STRUCT21, PATH_INs_STRUCT22, PATH_INs_STRUCT23,
    PATH_INs_STRUCT24, PATH_INs_STRUCT25, PATH_INs_STRUCT26, PATH_INs_STRUCT27,
    PATH_INs_STRUCT28,
]

# Input file name lists.
#
# Each list should contain, in this order:
#   [small dataset alignment used for NASP,
#    CT file obtained from NASP,
#    original full alignment used for coevolution]

# NOTE(review): LsArg0 and LsArg10 take the CT file from the coevolution directory rather
# than the structure directory used by most other entries - confirm this is intentional.
LsArg0 = [PATH_INs_STRUCT0 + "GCV_10Seq_d0.fas", PATH_IN0 + "GCVSep_d0_0.05.ct", PATH_IN0 + "GCV_10Sep.fas"]
LsArg1 = [PATH_INs_STRUCT1 + "PCV_9Seq_.fas", PATH_INs_STRUCT1 + "PCV_9Seq_0.05.ct", PATH_IN1 + "PCVSep.fas"]
# NOTE(review): LsArg2 and LsArg3 have no CT file entry (two elements only), so they cannot
# be passed to get_coevolution_info as they stand; both indices are absent from lx.
LsArg2 = [PATH_INs_STRUCT2 + "BFDV_10Seq_d0.fas", PATH_IN2 + "BFDVSep.fas"]
LsArg3 = [PATH_INs_STRUCT3 + "DCV_10Seq_d0.fas", PATH_IN3 + "DCVSep.fas"]
LsArg4 = [PATH_INs_STRUCT4 + "DG_CV_10Seq.fas", PATH_INs_STRUCT4 + "DG_CV_10Seq_0.05.ct", PATH_IN4 + "DG_CVSep.fas"]

LsArg5 = [PATH_INs_STRUCT5 + "a_10Seq.fas", PATH_INs_STRUCT5 + "a_10Seq_0.05.ct", PATH_IN5 + "Parvo_a_full.fas"]
# NOTE(review): LsArg6 reuses the "c_10Seq" file names for the Parvo_b dataset and its CT
# entry has no directory prefix - verify before adding index 6 to lx.
LsArg6 = [PATH_INs_STRUCT6 + "c_10Seq.fas", "c_10Seq_0.05.ct", PATH_IN6 + "Parvo_bE.fas"]
LsArg7 = [PATH_INs_STRUCT7 + "c_10Seq.fas", PATH_INs_STRUCT7 + "c_10Seq_0.05.ct", PATH_IN7 + "Parvo_cE_5231_full.fas"]
LsArg8 = [PATH_INs_STRUCT8 + "e_10Seq.fas", PATH_INs_STRUCT8 + "e_10Seq_0.05.ct", PATH_IN8 + "Parvo_eE.fas"]

LsArg9 = [PATH_INs_STRUCT9 + "a_10Seq.fas", PATH_INs_STRUCT9 + "a_10Seq_0.05.ct", PATH_IN9 + "Anello_a_full.fas"]
LsArg10 = [PATH_INs_STRUCT10 + "b_10Seq.fas", PATH_IN10 + "b_10Seq_0.05.ct", PATH_IN10 + "Anello_b_full.fas"]
LsArg11 = [PATH_INs_STRUCT11 + "c_10Seq.fas", PATH_INs_STRUCT11 + "c_10Seq_0.05.ct", PATH_IN11 + "Anello_c_full_sep.fas"]
LsArg12 = [PATH_INs_STRUCT12 + "e_10Seq.fas", PATH_INs_STRUCT12 + "e_10Seq_0.05.ct", PATH_IN12 + "Anello_eE.fas"]
LsArg13 = [PATH_INs_STRUCT13 + "BFDV_184_10seq.fas", PATH_INs_STRUCT13 + "BFDV_184_10seq_0.05.ct", PATH_IN13 + "BFDV_Full.fas"]

LsArg14 = [PATH_INs_STRUCT14 + "Dicot1_10seq_.fas", PATH_INs_STRUCT14 + "Dicot1_10seq__0.05.ct", PATH_IN14 + "Dicot1_full.fas"]
LsArg15 = [PATH_INs_STRUCT15 + "DNA_R_10Seq.fas", PATH_INs_STRUCT15 + "DNA_R_10Seq_0.05.ct", PATH_IN15 + "BBTV_R_full.fas"]
LsArg16 = [PATH_INs_STRUCT16 + "DNA_N_10Seq.fas", PATH_INs_STRUCT16 + "DNA_N_10Seq_0.05.ct", PATH_IN16 + "BBTV_N_full.fas"]
LsArg17 = [PATH_INs_STRUCT17 + "DNA_S_10Seq.fas", PATH_INs_STRUCT17 + "DNA_S_10Seq_0.05.ct", PATH_IN17 + "BBTV_S_full.fas"]
LsArg18 = [PATH_INs_STRUCT18 + "DNA_C_10Seq.fas", PATH_INs_STRUCT18 + "DNA_C_10Seq_0.05.ct", PATH_IN18 + "BBTV_C_full.fas"]

LsArg19 = [PATH_INs_STRUCT19 + "MSV_10seq_.fas", PATH_INs_STRUCT19 + "MSV_10seq__0.05.ct", PATH_IN19 + "MSV_full.fas"]
LsArg20 = [PATH_INs_STRUCT20 + "Dicot2_10seq_.fas", PATH_INs_STRUCT20 + "Dicot2_10seq__0.05.ct", PATH_IN20 + "Dicot2_full.fas"]
LsArg21 = [PATH_INs_STRUCT21 + "PanSV_10seq_.fas", PATH_INs_STRUCT21 + "PanSV_10seq__0.05.ct", PATH_IN21 + "PanSV_full.fas"]
LsArg22 = [PATH_INs_STRUCT22 + "WDV_10seq_.fas", PATH_INs_STRUCT22 + "WDV_10seq__0.05.ct", PATH_IN22 + "prof_WDV_full_E.fas"]
LsArg23 = [PATH_INs_STRUCT23 + "Begomo5_10seq.fas", PATH_INs_STRUCT23 + "Begomo5_10seq_0.05.ct", PATH_IN23 + "prof_Begomo5_full.fas"]
LsArg24 = [PATH_INs_STRUCT24 + "Begomo6_10seq.fas", PATH_INs_STRUCT24 + "Begomo6_10seq_0.05.ct", PATH_IN24 + "prof_Begomo6_full.fas"]
LsArg25 = [PATH_INs_STRUCT25 + "Begomo9_10seq.fas", PATH_INs_STRUCT25 + "Begomo9_10seq_0.05.ct", PATH_IN25 + "Begomo9_full.fas"]
LsArg26 = [PATH_INs_STRUCT26 + "DNA_M_10Seq.fas", PATH_INs_STRUCT26 + "DNA_M_10Seq_0.05.ct", PATH_IN26 + "DNA_M_full.fas"]
LsArg27 = [PATH_INs_STRUCT27 + "PiCV_10seq_.fas", PATH_INs_STRUCT27 + "PiCV_10seq__0.05.ct", PATH_IN27 + "prof_PiCV_full.fas"]
# NOTE(review): the CT entry of LsArg28 has no directory prefix - verify before adding
# index 28 to lx.
LsArg28 = [PATH_INs_STRUCT28 + "rubella_10seq_aligned__.fas", "rubella_10seq_aligned___0.05.ct", PATH_IN28 + "rubella_CG_34_aln_26.02_LC.fasta"]

# Putting all input file name lists into one list for easy access (list index == dataset number).
LsArgAll = [
    LsArg0, LsArg1, LsArg2, LsArg3, LsArg4, LsArg5, LsArg6, LsArg7, LsArg8,
    LsArg9, LsArg10, LsArg11, LsArg12, LsArg13, LsArg14, LsArg15, LsArg16,
    LsArg17, LsArg18, LsArg19, LsArg20, LsArg21, LsArg22, LsArg23, LsArg24,
    LsArg25, LsArg26, LsArg27, LsArg28,
]

#============================================ Defining the functions ===========================================================
def CreateMapping2(coev_ref,prof_fas,Coev_l):
    """Project per-site coevolution labels onto the columns of the profile alignment.

    The record of ``coev_ref`` whose id contains "Reference2" identifies the
    reference row; the row with exactly that id is then read from ``prof_fas``.

    Parameters:
        coev_ref: path to the FASTA alignment used for coevolution detection.
        prof_fas: path to the profile alignment (FASTA).
        Coev_l:   sequence of labels, one per column of the coevolution
                  alignment (the caller passes "yes"/"no" strings).

    Returns:
        A list with one entry per column of the reference row in ``prof_fas``:
        "-" where that row holds a gap, otherwise the next unused label from
        ``Coev_l``. The list is empty when no matching row is found.

    Raises:
        IndexError: if the reference row has more non-gap characters than
            ``Coev_l`` has labels.
    """
    reference_id = ""
    with open(coev_ref, "r") as ref_handle:
        for record in SeqIO.parse(ref_handle, "fasta"):
            if "Reference2" in str(record.id):
                # When several ids match, the last one is kept.
                reference_id = str(record.id)

    aligned_reference = ""
    with open(prof_fas, "r") as prof_handle:
        for record in SeqIO.parse(prof_handle, "fasta"):
            if str(record.id) == reference_id:
                aligned_reference = str(record.seq)

    # Walk the aligned reference row: gap columns get "-", every other column
    # consumes the next label.
    new_Coev = []
    index = 0
    for residue in aligned_reference:
        if residue == "-":
            new_Coev.append("-")
        else:
            new_Coev.append(Coev_l[index])
            index += 1
    return new_Coev

def CreateMapping1(ref_fas,prof_fas, ctf_ct):
    """Map the base-pairing information of a CT file onto the profile alignment.

    The record of ``ref_fas`` whose id contains "Reference" identifies the
    reference row; the row with exactly that id is then read from ``prof_fas``.

    Parameters:
        ref_fas:  path to the FASTA alignment used for structure prediction.
        prof_fas: path to the profile alignment (FASTA).
        ctf_ct:   path to the tab-separated CT file. Lines containing "ENERGY"
                  are treated as headers; on every other line the fifth field
                  is read as a 1-based position, and a non-zero value marks
                  that position as paired.

    Returns:
        A list with one entry per column of the reference row in ``prof_fas``:
        -1 where that row holds a gap, 1 where the corresponding reference
        position is paired, 0 otherwise. The list is empty when no matching
        row is found.

    Raises:
        IndexError: if the CT file names a position beyond the reference
            length, or the reference row has more non-gap characters than the
            reference sequence.
    """
    reference_id = ""
    reference_seq = ""
    with open(ref_fas, "r") as ref_handle:
        for record in SeqIO.parse(ref_handle, "fasta"):
            if "Reference" in str(record.id):
                # When several ids match, the last one is kept.
                reference_id = str(record.id)
                reference_seq = str(record.seq)

    aligned_reference = ""
    with open(prof_fas, "r") as prof_handle:
        for record in SeqIO.parse(prof_handle, "fasta"):
            if str(record.id) == reference_id:
                aligned_reference = str(record.seq)

    # One flag per reference position: 1 = paired, 0 = unpaired.
    paired_flags = [0] * len(reference_seq)
    with open(ctf_ct, "r") as ct_handle:
        for line in ct_handle:
            # Skip structure header lines and blank lines (e.g. a trailing newline).
            if "ENERGY" in line or not line.strip():
                continue
            partner = int(line.split("\t")[4])
            if partner != 0:
                paired_flags[partner - 1] = 1

    # Walk the aligned reference row: gap columns get -1, every other column
    # consumes the next pairing flag.
    newMapp = []
    index = 0
    for residue in aligned_reference:
        if residue == "-":
            newMapp.append(-1)
        else:
            newMapp.append(paired_flags[index])
            index += 1
    return newMapp

def _read_fasta_gaps_as_n(fasta_path):
    """Parse ``fasta_path`` and return its records with every "-" replaced by "N"."""
    records = []
    with open(fasta_path, "r") as handle:
        for rec in SeqIO.parse(handle, "fasta"):
            seq_mut = rec.seq.tomutable()
            for j in range(len(seq_mut)):
                if seq_mut[j] == "-":
                    seq_mut[j] = "N"
            rec.seq = seq_mut
            records.append(rec)
    return records


def _write_with_reference_id(records, fasta_path, reference_id):
    """Rename the first record to ``reference_id`` and write all records as FASTA."""
    records[0].id = reference_id
    with open(fasta_path, "w") as handle:
        SeqIO.write(records, handle, "fasta")


def Profile_Algn(import_in1fas,import_in2fas,PATH_IN_CT,Coev_List):
    """Profile-align the structure alignment against the coevolution alignment.

    Writes three files into the current working directory, named after the
    input base names: "ref_<in1>", "full_<in2>" and "prof_<in2>". In the first
    two, every "-" is replaced by "N" and the first record is renamed to
    "Reference" / "Reference2"; CreateMapping1 and CreateMapping2 treat each
    "-" in the resulting profile row as a gap column. MUSCLE is then run in
    -profile mode to produce the third file.

    Parameters:
        import_in1fas: path to the small alignment used for secondary
                       structure prediction.
        import_in2fas: path to the full alignment used for coevolution
                       analysis.
        PATH_IN_CT:    path to the CT file, forwarded to CreateMapping1.
        Coev_List:     per-column coevolution labels, forwarded to
                       CreateMapping2.

    Returns:
        Tuple ``(mapp1, mapp2)`` holding the results of CreateMapping1 and
        CreateMapping2.

    Raises:
        IndexError: if either input alignment contains no records.
    """
    Ref_Fas_Name = "ref_" + import_in1fas.split("/")[-1]
    Full_Fas_Name = "full_" + import_in2fas.split("/")[-1]
    Prof_Fas_Name = "prof_" + import_in2fas.split("/")[-1]

    print("performing the profile alignment")
    # The inputs are read in place; the degapped, renamed copies are written
    # straight to the working directory.
    ref_records = _read_fasta_gaps_as_n(import_in1fas)
    full_records = _read_fasta_gaps_as_n(import_in2fas)
    _write_with_reference_id(ref_records, Ref_Fas_Name, "Reference")
    _write_with_reference_id(full_records, Full_Fas_Name, "Reference2")

    #Running an alignment program
    #Change the path to muscle executable file (muscle3.8.31_i86win32.exe)
    cmd = ['C:/My Programs/Muscle/muscle3.8.31_i86win32.exe', '-profile',
           '-in1', Full_Fas_Name, '-in2', Ref_Fas_Name, '-out', Prof_Fas_Name]
    print(cmd)
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    (output, error) = proc.communicate()
    if proc.returncode != 0:
        # Report MUSCLE's own diagnostics as well, as is done for R in
        # get_coevolution_info.
        # NOTE: processing still continues, so the mappings below are built
        # from whatever profile file exists on disk.
        sys.stderr.write('Error occured running Muscle\n')
        sys.stderr.write(error)
    else:
        print("profile alignment completed")

    #Creating the mappings
    mapp1 = CreateMapping1(Ref_Fas_Name, Prof_Fas_Name, PATH_IN_CT)
    mapp2 = CreateMapping2(Full_Fas_Name, Prof_Fas_Name, Coev_List)
    return (mapp1, mapp2)

def _is_significant_coev_line(line):
    """Return True when a tab-separated result line passes both thresholds.

    Field 11 (index 10) must be below 0.05 and field 13 (index 12) above 1.
    Lines with fewer than 13 fields, or with "I" or "NA" in either field, are
    not significant. Field 11 is presumably the p-value mentioned in the
    header comments; the meaning of field 13 is not visible in this script.
    """
    fields = [field.strip() for field in line.split("\t")]
    if len(fields) < 13:
        return False
    first_value = fields[10]
    second_value = fields[12]
    if first_value in ("I", "NA") or second_value in ("I", "NA"):
        return False
    return float(first_value) < 0.05 and float(second_value) > 1


def get_coevolution_info(path,f_10_fas,f_10_ct,coev_full_file):
    """Test one dataset for association between base pairing and coevolution.

    Steps:
      1. Every file in ``path`` ending in "_100.txt" is scanned from its third
         line onwards; if any line is significant, the site whose index is the
         second "_"-separated token of the file name is labelled "yes".
      2. Profile_Algn maps the pairing and coevolution labels onto a common
         profile alignment.
      3. Columns that are gaps in neither mapping are written as a
         "Paired<TAB>Coev" table to "<base>_table.txt" and to "tbl.txt" in the
         current directory.
      4. Rscript runs PATH_OUT + "chisquared_test_coev_hcss.r" and its stdout
         is stored in "<base>_hcss_pvl.txt".

    ``<base>`` is the file name of ``f_10_fas`` up to its first ".".

    Parameters:
        path:           directory with the coevolution result files; it is
                        concatenated directly with the file names, so it must
                        end in a path separator.
        f_10_fas:       path to the small alignment used for structure
                        prediction.
        f_10_ct:        path to the CT file obtained from that alignment.
        coev_full_file: path to the full alignment used for coevolution.

    Returns:
        None. When the two mappings differ in length, a message is written to
        stderr and no table is produced.
    """
    #Collecting coevolution results
    with open(coev_full_file) as full_handle:
        records = list(SeqIO.parse(full_handle, "fasta"))
    coev_string = ["no"] * len(records[0].seq)
    coev_out_files = [f for f in os.listdir(path) if f.endswith("_100.txt")]
    print(coev_full_file)
    for f in coev_out_files:
        with open(path + f) as h:
            lines = h.readlines()
        # The first two lines are skipped, as in the original scan.
        if any(_is_significant_coev_line(line) for line in lines[2:]):
            # NOTE(review): the number embedded in the file name is used
            # directly as a 0-based site index - confirm the numbering.
            coev_string[int(f.split("_")[1])] = "yes"

    #Profile alignment for pairing and coevolution alignment
    paired_map, coev_map = Profile_Algn(f_10_fas, coev_full_file, f_10_ct, coev_string)
    if len(coev_map) != len(paired_map):
        sys.stderr.write("Mapping lengths differ (%d vs %d) for %s; skipping chi-squared test\n"
                         % (len(paired_map), len(coev_map), f_10_fas))
        return

    base_name = f_10_fas.split("/")[-1].split(".")[0]

    #Mapping the pairing results to coevolution results
    table_lines = ["Paired\tCoev"]
    for paired, coev in zip(paired_map, coev_map):
        # Columns that are a gap in either alignment carry no information.
        if paired == -1 or coev == "-":
            continue
        if paired == 1:
            var = "yes"
        elif paired == 0:
            var = "no"
        else:
            var = ""
        table_lines.append(var + "\t" + coev)
    table_text = "\n".join(table_lines)
    # The same table is written under a dataset-specific name and as "tbl.txt".
    with open(base_name + "_table.txt", "w") as hout:
        hout.write(table_text)
    with open("tbl.txt", "w") as hout2:
        hout2.write(table_text)

    #Running the chisquared test
    #Change the path to R exectutable file (Rscript.exe)
    cmd = ['C:/Program Files/R/R-3.0.1/bin/Rscript.exe', PATH_OUT + 'chisquared_test_coev_hcss.r']
    proc = Popen(cmd, stdout=PIPE, stderr=PIPE)
    (output, error) = proc.communicate()
    # The result file is created even when R fails; it is then left empty.
    with open(base_name + "_hcss_pvl.txt", 'w') as outf:
        if proc.returncode != 0:
            sys.stderr.write('Error running R \n---------------\n\n\n')
            sys.stderr.write(error)
        else:
            print(base_name + "_hcss_pvl.txt")
            print(output)
            outf.write(output)


#================================================== main program =====================================================		
#Indices (into PATH_LIST and LsArgAll) of the datasets to analyse.
lx = [
    1, 4, 5, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20,
    21, 22, 23, 24, 25, 26, 27,
]

#Running every selected dataset in turn.
for dataset_index in lx:
    dataset_files = LsArgAll[dataset_index]
    get_coevolution_info(
        PATH_LIST[dataset_index],
        dataset_files[0],   # small alignment used for structure prediction
        dataset_files[1],   # CT file
        dataset_files[2],   # full alignment used for coevolution
    )
print('done')
